This document contains the exploratory data analysis (EDA) for the
Airbnb Berlin rental demand prediction project. We analyze the
preprocessed data produced by the
01_data_preprocessing.R script; this data provides a clean and
consistent set of features as well as our target variable,
demand_proxy.
# Directory that holds the cleaned CSV files
processed_data_path <- "../data/processed/"

# Read the training and test splits from that directory
train_data <- read.csv(
  file = paste0(processed_data_path, "train_berlin_clean.csv")
)
test_data <- read.csv(
  file = paste0(processed_data_path, "test_berlin_clean.csv")
)

# Report the number of rows and columns in the training set
cat(
  "Train dataset dimensions:", nrow(train_data), "rows,",
  ncol(train_data), "columns\n"
)
## Train dataset dimensions: 15692 rows, 43 columns
## Test dataset dimensions: 7842 rows, 38 columns
##
## Target variable (demand_proxy) statistics:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.001179 0.130896 0.201334 0.270047 0.875000
# Show the type and leading values of the identifier, categorical,
# price/review and target columns
str(
  train_data[c(
    "listing_id", "neighbourhood", "property_type", "room_type",
    "price", "reviews", "demand_proxy"
  )]
)
## 'data.frame': 15692 obs. of 7 variables:
## $ listing_id : int 19665213 6436842 10559468 27215482 27287546 26590915 32996974 17364275 23775462 13483316 ...
## $ neighbourhood: chr "Prenzlauer Berg" "Pankow" "Prenzlauer Berg" "Friedrichshain" ...
## $ property_type: chr "Apartment" "Apartment" "Apartment" "Apartment" ...
## $ room_type : chr "Private room" "Entire home/apt" "Entire home/apt" "Private room" ...
## $ price : int 26 41 50 50 55 39 94 73 100 50 ...
## $ reviews : int 6 6 2 4 0 10 5 14 13 2 ...
## $ demand_proxy : num 0.257 0.257 0.127 0.13 0 ...
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 8.00 32.00 49.00 60.34 70.00 900.00 9
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 1.00 5.00 19.45 17.00 424.00
Our target variable demand_proxy was created during
preprocessing as a composite measure based on review counts and
normalized scores.
# Histogram of the target variable over the training listings (30 bins)
ggplot(data = train_data, mapping = aes(x = demand_proxy)) +
  geom_histogram(bins = 30, alpha = 0.7, fill = "steelblue") +
  ggtitle("Distribution of Demand Proxy") +
  xlab("Demand Proxy (0-1 scale)") +
  ylab("Count") +
  theme_minimal()
# Box plot of demand proxy values
# Only y is mapped, so a single box summarises the whole training set
ggplot(data = train_data, mapping = aes(y = demand_proxy)) +
  geom_boxplot(alpha = 0.7, fill = "steelblue") +
  ggtitle("Boxplot of Demand Proxy Values") +
  ylab("Demand Proxy (0-1 scale)") +
  theme_minimal()
# Relationship between demand proxy and price
# One point per listing; alpha = 0.2 lets overlapping points show up as
# darker areas
ggplot(data = train_data, mapping = aes(x = price, y = demand_proxy)) +
  geom_point(color = "steelblue", alpha = 0.2) +
  ggtitle("Relationship Between Price and Demand Proxy") +
  xlab("Price (Euro)") +
  ylab("Demand Proxy") +
  theme_minimal()
# Price distribution
# Histogram of listing prices using 50 bins
ggplot(data = train_data, mapping = aes(x = price)) +
  geom_histogram(bins = 50, alpha = 0.7, fill = "steelblue") +
  ggtitle("Distribution of Airbnb Prices in Berlin") +
  xlab("Price (Euro)") +
  ylab("Count") +
  theme_minimal()
# Price boxplot
# Single box over all training prices (only the y aesthetic is mapped)
ggplot(data = train_data, mapping = aes(y = price)) +
  geom_boxplot(alpha = 0.7, fill = "steelblue") +
  ggtitle("Boxplot of Airbnb Prices in Berlin") +
  ylab("Price (Euro)") +
  theme_minimal()
# Create a leaflet map
# Build the map in steps: widget bound to the training data, default base
# tiles, one circle marker per listing, then a title control
leaflet_map <- leaflet(data = train_data)
leaflet_map <- addTiles(leaflet_map)
leaflet_map <- addCircleMarkers(
  leaflet_map,
  lng = ~longitude,
  lat = ~latitude,
  radius = 2,
  color = "blue",
  fillOpacity = 0.5,
  # Popup text shows the listing's price and room type
  popup = ~paste("Price:", price, "<br>", "Room Type:", room_type)
)
leaflet_map <- addControl(
  leaflet_map,
  html = "<b>Airbnb Listings in Berlin</b>",
  position = "topright"
)
# Display the map
leaflet_map
# Create bins for price
# Bin edges in Euro. cut() builds right-closed intervals by default, so the
# resulting bins are [0, 50], (50, 100], (100, 150], (150, 200], (200, Inf).
price_breaks <- c(0, 50, 100, 150, 200, Inf)
# The first label is "<= 50" (not "< 50") because a price of exactly 50 falls
# into the first bin under right-closed intervals.
price_labels <- c("<= 50", "50-100", "100-150", "150-200", "> 200")
train_data$price_bin <- cut(
  train_data$price,
  breaks = price_breaks,
  labels = price_labels,
  # Close the first interval on the left so a price of exactly 0 is binned
  # instead of becoming NA
  include.lowest = TRUE
)
# Color palette for the price bins; rows whose price is NA have an NA bin
price_pal <- colorFactor(
  palette = c("green", "blue", "purple", "orange", "red"),
  domain = train_data$price_bin
)
# Same map as before, but each marker is colored by its price bin and a
# legend explains the colors
price_map <- leaflet(data = train_data)
price_map <- addTiles(price_map)
price_map <- addCircleMarkers(
  price_map,
  lng = ~longitude,
  lat = ~latitude,
  radius = 2,
  color = ~price_pal(price_bin),
  fillOpacity = 0.7,
  # Popup text shows the listing's price and room type
  popup = ~paste("Price:", price, "<br>", "Room Type:", room_type)
)
price_map <- addLegend(
  price_map,
  position = "bottomright",
  pal = price_pal,
  values = ~price_bin,
  title = "Price (Euro)",
  opacity = 1
)
# Display the map
price_map
# Count listings by neighborhood
# Listings per neighbourhood, most frequent first
neighborhood_counts <- sort(
  table(train_data$neighbourhood),
  decreasing = TRUE
)
# Keep the 15 neighbourhoods with the most listings
top_neighborhoods <- head(neighborhood_counts, n = 15)
# Horizontal bar chart of those counts; las = 1 keeps axis labels horizontal
barplot(
  height = top_neighborhoods,
  main = "Top 15 Neighborhoods by Number of Listings",
  xlab = "Number of Listings",
  col = "steelblue",
  horiz = TRUE,
  las = 1
)
# Average price by neighborhood
# Mean price per neighbourhood, sorted from most to least expensive
neighborhood_prices <- aggregate(
  price ~ neighbourhood,
  data = train_data,
  FUN = mean
)
neighborhood_prices <- neighborhood_prices[
  order(-neighborhood_prices[["price"]]),
]
# Keep the 15 neighbourhoods with the highest mean price
top_price_neighborhoods <- head(neighborhood_prices, n = 15)
# Horizontal bar chart of those mean prices, labelled by neighbourhood
with(
  top_price_neighborhoods,
  barplot(
    height = price,
    names.arg = neighbourhood,
    main = "Top 15 Neighborhoods by Average Price",
    xlab = "Average Price (Euro)",
    col = "steelblue",
    horiz = TRUE,
    las = 1
  )
)
# Distribution of room types
# Listings per room type, most frequent first
room_counts <- sort(table(train_data$room_type), decreasing = TRUE)
# Horizontal bar chart of the room type counts
barplot(
  height = room_counts,
  main = "Distribution of Room Types",
  xlab = "Count",
  col = "steelblue",
  horiz = TRUE,
  las = 1
)
# Analyze average price by room type
# Mean price per room type, sorted from most to least expensive
room_prices <- aggregate(
  price ~ room_type,
  data = train_data,
  FUN = mean
)
room_prices <- room_prices[order(-room_prices[["price"]]), ]
# Vertical bar chart of the mean prices, labelled by room type
with(
  room_prices,
  barplot(
    height = price,
    names.arg = room_type,
    main = "Average Price by Room Type",
    ylab = "Average Price (Euro)",
    col = "steelblue",
    las = 1
  )
)
# Boxplot of price by room type
# One box per room type; the x-axis title is blanked because the group
# labels already name the room types
boxplot(
  price ~ room_type,
  data = train_data,
  main = "Price Distribution by Room Type",
  xlab = "",
  ylab = "Price (Euro)",
  col = "steelblue"
)
# Boxplot of demand proxy by room type
# One box per room type for the target variable; x-axis title left blank
boxplot(
  demand_proxy ~ room_type,
  data = train_data,
  main = "Demand by Room Type",
  xlab = "",
  ylab = "Demand Proxy",
  col = "steelblue"
)
Based on our exploratory data analysis, we’ve identified several key insights:
Demand Distribution: The demand proxy shows a right-skewed distribution, with most properties having low to moderate demand and fewer properties having very high demand.
Price Patterns:
Geographic Patterns:
Neighborhood Insights:
Property Characteristics:
These insights will guide our feature selection and modeling approach in the next phase of the project.